In [ ]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib notebook

Out-of-core text classification with the HashingVectorizer

Using the Amazon movie reviews collected by J. McAuley and J. Leskovec:

https://snap.stanford.edu/data/web-Movies.html
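
The full file does not fit comfortably in memory, so the plan is to stream the reviews and train a linear classifier with partial_fit on successive batches. The HashingVectorizer makes this possible because it is stateless: instead of building a vocabulary, it hashes each token into one of a fixed number of columns, so every batch is mapped into the same feature space. A minimal sketch of the idea (the sentences and the small n_features value are just for illustration):

In [ ]:
from sklearn.feature_extraction.text import HashingVectorizer

demo_vectorizer = HashingVectorizer(n_features=2 ** 10)  # tiny feature space, just for this demo
X_demo = demo_vectorizer.transform(["a great movie", "a terrible movie"])
print(X_demo.shape)  # (2, 1024): fixed width no matter which tokens appear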


In [ ]:
import os
print("file size: %d GB" % (os.path.getsize("data/movies.txt") / 1024 ** 3))

In [ ]:
with open("data/movies.txt", errors="ignore") as f:
    print(f.read(4000))

In [ ]:
def review_iter(f):
    current_post = []
    for line in f:
        if line.startswith("product/productId"):
            if len(current_post):
                # fields come in a fixed order: the score is the fourth line
                # after the product id, the review text starts at the seventh
                try:
                    score = current_post[3].split("review/score:", 1)[1].strip()
                    review = "".join(current_post[6:]).split("review/text:", 1)[1].strip()
                    yield int(float(score)), review
                except (ValueError, IndexError):
                    # there are about 20 posts with extra linebreaks that shift
                    # the fields; we just skip those for simplicity
                    pass
            current_post = []
        else:
            current_post.append(line)
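
review_iter relies on the fixed field order shown in the raw dump above. A small check on a made-up record (the values below are invented; only the field names match the real format):

In [ ]:
from io import StringIO

fake_record = StringIO("""\
product/productId: B000000000
review/userId: A000000000
review/profileName: Jane Doe
review/helpfulness: 1/2
review/score: 5.0
review/time: 1234567890
review/summary: great
review/text: Loved every minute of it.
product/productId: B000000001
""")
print(list(review_iter(fake_record)))  # [(5, 'Loved every minute of it.')]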

In [ ]:
n_reviews = 0
with open("data/movies.txt", 'r', errors='ignore') as f:
    for r in review_iter(f):
        n_reviews += 1

print("Number of reviews: %d" % n_reviews)

In [ ]:
from itertools import islice

with open("data/movies.txt", errors="ignore") as f:
    reviews = islice(review_iter(f), 10000)
    scores, texts = zip(*reviews)
print(np.bincount(scores))

In [ ]:
from itertools import zip_longest  # on Python 2 this is called izip_longest
# from the itertools recipes
def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
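
A quick look at what grouper produces: each chunk is a tuple of n items, and the last chunk is padded with the fillvalue (here 'x'):

In [ ]:
print(list(grouper("ABCDEFG", 3, fillvalue="x")))
# [('A', 'B', 'C'), ('D', 'E', 'F'), ('G', 'x', 'x')]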

In [ ]:
def preprocess_batch(reviews):
    # the last batch from grouper is padded with None, so drop those;
    # score == 3 is "neutral", we only want "positive" or "negative"
    reviews_filtered = [r for r in reviews if r is not None and r[0] != 3]
    scores, texts = zip(*reviews_filtered)
    polarity = np.array(scores) > 3
    return polarity, texts
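
As a quick check, we can run preprocess_batch on the 10000 reviews loaded earlier (the variable names below are just for illustration): the neutral 3-star reviews are dropped and the remaining scores are binarized into a positive/negative polarity.

In [ ]:
polarity_demo, texts_demo = preprocess_batch(list(zip(scores, texts)))
print(len(texts_demo), polarity_demo.mean())  # number of non-neutral reviews, fraction positive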

In [ ]:
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer(decode_error="ignore")

with open("data/movies.txt", errors="ignore") as f:
    reviews = islice(review_iter(f), 10000)
    polarity_test, texts_test = preprocess_batch(reviews)
    X_test = vectorizer.transform(texts_test)
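
The hashing trick gives every batch the same fixed number of columns (2 ** 20 by default) without storing a vocabulary, which is what allows the incremental training below. A quick sanity check on the test matrix:

In [ ]:
print(X_test.shape)  # (n_test_reviews, 1048576): 2 ** 20 features by default
print(repr(X_test))  # sparse matrix; only the non-zero entries are stored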

In [ ]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(random_state=0)

accuracies = []
with open("data/movies.txt", errors="ignore") as f:
    training_set = islice(review_iter(f), 10000, None)
    batch_iter = grouper(training_set, 10000)
    for batch in batch_iter:
        polarity, texts = preprocess_batch(batch)
        X = vectorizer.transform(texts)
        sgd.partial_fit(X, polarity, classes=[0, 1])
        accuracies.append(sgd.score(X_test, polarity_test))
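
Because the vectorizer is stateless, the exact same transform can be applied to any new text at prediction time. A minimal sketch of using the trained model (the example reviews are made up):

In [ ]:
new_reviews = ["An absolute masterpiece, I loved it.",
               "Terrible acting and a boring plot."]
X_new = vectorizer.transform(new_reviews)
print(sgd.predict(X_new))  # 1 = predicted positive polarity, 0 = predicted negative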

In [ ]:
plt.plot(accuracies)
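
For readability, one might label the axes of the learning curve (purely cosmetic, applied to the figure created above):

In [ ]:
plt.xlabel("number of batches seen (10000 reviews each)")
plt.ylabel("accuracy on held-out test batch")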